In [2]:
# GENDER AGENDAS MAPPER
# V3 - July 2025
# Developed by the Gender Justice Data Hub
# CC BY-NC-SA 4.0, Global Fund for Women
In [4]:
# Uninstall everything related
!pip uninstall -y torch torchvision torchaudio transformers sentence-transformers bertopic umap-learn hdbscan accelerate bitsandbytes xformers

# Clear pip cache
!pip cache purge

# First install PyTorch with CUDA
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124

# Then the base dependencies
!pip install transformers==4.35.2 accelerate bitsandbytes

# Install sentence-transformers before BERTopic
!pip install sentence-transformers

# Finally install BERTopic and its dependencies
!pip install bertopic umap-learn hdbscan adjustText
Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Successfully uninstalled torch-2.2.2
WARNING: Skipping torchvision as it is not installed.
WARNING: Skipping torchaudio as it is not installed.
Found existing installation: transformers 4.53.2
Uninstalling transformers-4.53.2:
  Successfully uninstalled transformers-4.53.2
Found existing installation: sentence-transformers 5.0.0
Uninstalling sentence-transformers-5.0.0:
  Successfully uninstalled sentence-transformers-5.0.0
Found existing installation: bertopic 0.17.3
Uninstalling bertopic-0.17.3:
  Successfully uninstalled bertopic-0.17.3
Found existing installation: umap-learn 0.5.9.post2
Uninstalling umap-learn-0.5.9.post2:
  Successfully uninstalled umap-learn-0.5.9.post2
Found existing installation: hdbscan 0.8.40
Uninstalling hdbscan-0.8.40:
  Successfully uninstalled hdbscan-0.8.40
Found existing installation: accelerate 1.8.1
Uninstalling accelerate-1.8.1:
  Successfully uninstalled accelerate-1.8.1
Found existing installation: bitsandbytes 0.42.0
Uninstalling bitsandbytes-0.42.0:
  Successfully uninstalled bitsandbytes-0.42.0
WARNING: Skipping xformers as it is not installed.
Files removed: 68
Looking in indexes: https://download.pytorch.org/whl/cu124
ERROR: Could not find a version that satisfies the requirement torch==2.6.0 (from versions: none)
ERROR: No matching distribution found for torch==2.6.0
Collecting transformers==4.35.2
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 4.1 MB/s eta 0:00:00
Collecting accelerate
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (3.13.1)
Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.33.4)
Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (1.26.4)
Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (6.0.1)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2023.10.3)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2.32.3)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2)
  Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Requirement already satisfied: safetensors>=0.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.5.2)
Requirement already satisfied: tqdm>=4.27 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (4.65.0)
Requirement already satisfied: psutil in /opt/anaconda3/lib/python3.11/site-packages (from accelerate) (5.9.0)
Collecting torch>=2.0.0 (from accelerate)
  Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from bitsandbytes) (1.11.4)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (2023.6.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (4.14.0)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (1.1.5)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2025.4.26)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=2.0.0->accelerate) (2.1.3)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=2.0.0->accelerate) (1.3.0)
Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 11.3 MB/s eta 0:00:0000:0100:01
Downloading accelerate-1.9.0-py3-none-any.whl (367 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 367.1/367.1 kB 7.9 MB/s eta 0:00:00:00:01
Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.0/105.0 MB 5.5 MB/s eta 0:00:0000:0100:01
Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl (2.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 3.3 MB/s eta 0:00:00a 0:00:01
Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl (150.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.8/150.8 MB 5.2 MB/s eta 0:00:0000:0100:01
Installing collected packages: torch, bitsandbytes, tokenizers, accelerate, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.2
    Uninstalling tokenizers-0.21.2:
      Successfully uninstalled tokenizers-0.21.2
Successfully installed accelerate-1.9.0 bitsandbytes-0.42.0 tokenizers-0.15.2 torch-2.2.2 transformers-4.35.2
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 1.3 MB/s eta 0:00:00
Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0)
Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2)
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4)
Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4)
Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0)
Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0)
Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3)
Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 470.2/470.2 kB 7.1 MB/s eta 0:00:00a 0:00:01
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 10.5 MB/s eta 0:00:0000:010:01
Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl (2.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 11.2 MB/s eta 0:00:0000:0100:01
Installing collected packages: tokenizers, transformers, sentence-transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed sentence-transformers-5.0.0 tokenizers-0.21.2 transformers-4.53.2
Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl.metadata (15 kB)
Requirement already satisfied: adjustText in /opt/anaconda3/lib/python3.11/site-packages (1.3.0)
Requirement already satisfied: numpy>=1.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.26.4)
Requirement already satisfied: pandas>=1.1.5 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (2.1.4)
Requirement already satisfied: plotly>=4.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.9.0)
Requirement already satisfied: scikit-learn>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.6.1)
Requirement already satisfied: sentence-transformers>=0.4.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.0.0)
Requirement already satisfied: tqdm>=4.41.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (4.65.0)
Requirement already satisfied: llvmlite>0.36.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (0.42.0)
Requirement already satisfied: scipy>=1.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (1.11.4)
Requirement already satisfied: numba>=0.51.2 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.59.0)
Requirement already satisfied: pynndescent>=0.5 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.5.13)
Requirement already satisfied: joblib>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from hdbscan) (1.2.0)
Requirement already satisfied: matplotlib in /opt/anaconda3/lib/python3.11/site-packages (from adjustText) (3.8.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from plotly>=4.7.0->bertopic) (8.2.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn>=1.0->bertopic) (3.5.0)
Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.53.2)
Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (2.2.2)
Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (0.33.4)
Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (10.2.0)
Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.14.0)
Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.2.0)
Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.4.4)
Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (23.1)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (3.0.9)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.13.1)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2023.6.0)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (6.0.1)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.32.3)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (1.1.5)
Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1.3)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (2023.10.3)
Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.21.2)
Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.5.2)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2025.4.26)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.3.0)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 153.0/153.0 kB 5.3 MB/s eta 0:00:00
Downloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 90.1/90.1 kB 8.6 MB/s eta 0:00:00
Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl (1.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 3.5 MB/s eta 0:00:00a 0:00:01m
Installing collected packages: hdbscan, umap-learn, bertopic
Successfully installed bertopic-0.17.3 hdbscan-0.8.40 umap-learn-0.5.9.post2
In [5]:
!pip install openai --upgrade
Requirement already satisfied: openai in /opt/anaconda3/lib/python3.11/site-packages (1.96.0)
Collecting openai
  Downloading openai-1.97.0-py3-none-any.whl.metadata (29 kB)
Requirement already satisfied: anyio<5,>=3.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.2.0)
Requirement already satisfied: distro<2,>=1.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.8.0)
Requirement already satisfied: httpx<1,>=0.23.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.28.1)
Requirement already satisfied: jiter<1,>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.10.0)
Requirement already satisfied: pydantic<3,>=1.9.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (2.8.2)
Requirement already satisfied: sniffio in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.3.0)
Requirement already satisfied: tqdm>4 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.65.0)
Requirement already satisfied: typing-extensions<5,>=4.11 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.14.0)
Requirement already satisfied: idna>=2.8 in /opt/anaconda3/lib/python3.11/site-packages (from anyio<5,>=3.5.0->openai) (3.10)
Requirement already satisfied: certifi in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (2025.4.26)
Requirement already satisfied: httpcore==1.* in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (1.0.7)
Requirement already satisfied: h11<0.15,>=0.13 in /opt/anaconda3/lib/python3.11/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0)
Requirement already satisfied: annotated-types>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (0.6.0)
Requirement already satisfied: pydantic-core==2.20.1 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (2.20.1)
Downloading openai-1.97.0-py3-none-any.whl (764 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 765.0/765.0 kB 8.8 MB/s eta 0:00:0000:0100:01
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.96.0
    Uninstalling openai-1.96.0:
      Successfully uninstalled openai-1.96.0
Successfully installed openai-1.97.0
In [8]:
!pip install polars-lts-cpu
Requirement already satisfied: polars-lts-cpu in /opt/anaconda3/lib/python3.11/site-packages (1.31.0)
In [10]:
# Smoke-test cell: import the core stack step by step and print a marker after
# each stage, so a failing import is easy to locate in the cell output.
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

import torch
# Report the installed PyTorch build and whether a CUDA device is usable.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

from bertopic import BERTopic
print("BERTopic imported successfully")

from sentence_transformers import SentenceTransformer
print("SentenceTransformers is working")

# Representation models imported here only to confirm they resolve.
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

import openai
PyTorch version: 2.2.2
CUDA available: False
<frozen importlib._bootstrap>:241: RuntimeWarning: pyarrow.lib.IpcReadOptions size changed, may indicate binary incompatibility. Expected 96 from C header, got 104 from PyObject
BERTopic imported successfully
SentenceTransformers is working
In [11]:
# Main import cell: data handling, embedding / dimensionality-reduction /
# clustering components, stop words, BERTopic, and the OpenAI credentials.
import pandas as pd
import re
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

import nltk
# Fetch the NLTK stop-word corpus (reported as up-to-date when already present).
nltk.download("stopwords")
from nltk.corpus import stopwords

from bertopic import BERTopic
from bertopic.representation import OpenAI, KeyBERTInspired, MaximalMarginalRelevance

# Load API key from .env
from dotenv import load_dotenv
import os
import openai

# Read the .env file, then take the key from the OPENAI_API_KEY environment
# variable; `os.getenv` returns None when the variable is not set.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

print("Packages loaded successfully.")
Packages loaded successfully.
[nltk_data] Downloading package stopwords to /Users/Condi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [16]:
import openai
from dotenv import load_dotenv
import os

# Load API key from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Create OpenAI client for v1.x
client = openai.OpenAI(api_key=api_key)

# Prompt
# Defined before the representation model so it can be handed to it below.
# [DOCUMENTS] and [KEYWORDS] are placeholder tags filled in per topic.
prompt = """
I have a topic that contains the following documents: [DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""

# Load topic representation
from bertopic.representation import OpenAI as OpenAI_Representation

# Topic labels are generated with gpt-4o using the custom prompt above.
# Without `prompt=prompt` the prompt would be defined but never used.
# `delay_in_seconds` is the pause setting passed to the representation model.
representation_model = OpenAI_Representation(
    client=client,
    model="gpt-4o",
    prompt=prompt,
    delay_in_seconds=10
)
In [20]:
import pandas as pd

# CSV exports to combine into a single events table.
file_list = ["Input/gender_Apr25-1.csv"]

# Read each file and coerce `event_date` to datetime; no rows are filtered out.
dfs = [
    pd.read_csv(csv_path).assign(
        event_date=lambda frame: pd.to_datetime(frame["event_date"])
    )
    for csv_path in file_list
]

# Stack all frames into one, renumbering the index from zero.
df = pd.concat(dfs, ignore_index=True)
In [22]:
# Sanity check on the combined frame: row count and covered date range.
# The load step applies no filter, so the label says "loaded", not "after filtering".
print("Number of rows loaded:", len(df))
print("Dates:", df["event_date"].min(), "→", df["event_date"].max())
Number of rows after filtering: 84118
Dates: 1997-01-06 00:00:00 → 2025-04-25 00:00:00
In [24]:
import polars as pl

# Convert the combined pandas frame to a Polars DataFrame; the bare expression
# on the last line renders a preview of the result as the cell output.
pl_df = pl.from_pandas(df)
pl_df
Out[24]:
shape: (84_118, 31)
event_id_cntyevent_dateyeartime_precisiondisorder_typeevent_typesub_event_typeactor1assoc_actor_1inter1actor2assoc_actor_2inter2interactioncivilian_targetingisoregioncountryadmin1admin2admin3locationlatitudelongitudegeo_precisionsourcesource_scalenotesfatalitiestagstimestamp
strdatetime[ns]i64i64strstrstrstrstrstrstrstrstrstrstri64strstrstrstrstrstrf64f64i64strstrstri64stri64
"ARG16601"2025-04-25 00:00:0020251"Demonstrations""Protests""Peaceful protest""Protesters (Argentina)""Women (Argentina)""Protesters"nullnullnull"Protesters only"null32"South America""Argentina""Cordoba""Punilla"null"Capilla del Monte"-30.8568-64.52581"El Diario de Carlos Paz""Subnational""On 25 April 2025, in Capilla d…0"crowd size=large"1745881584
"BRA96908"2025-04-25 00:00:0020252"Political violence""Violence against civilians""Attack""CV: Red Command"null"Political militia""Civilians (Brazil)""Women (Brazil)""Civilians""Political militia-Civilians""Civilian targeting"76"South America""Brazil""Bahia""Salvador"null"Salvador"-12.9711-38.51081"Alo Juca; Bnews (Brazil)""Subnational-National""Around 25 April 2025 (as repor…1"women targeted: girls"1745881585
"ISR45719"2025-04-25 00:00:0020251"Demonstrations""Protests""Peaceful protest""Protesters (Israel)""Shift 101; Women (Israel)""Protesters"nullnullnull"Protesters only"null376"Middle East""Israel""Jerusalem""Jerusalem""Judean Mountains""Jerusalem"31.76935.21631"Haaretz""National""On 25 April 2025, about 200 Is…0"crowd size=about 200"1745881590
"MEX103000"2025-04-25 00:00:0020251"Political violence""Violence against civilians""Attack""Unidentified Armed Group (Mexi…null"Political militia""Civilians (Mexico)""Labor Group (Mexico); Women (M…"Civilians""Political militia-Civilians""Civilian targeting"484"North America""Mexico""Guanajuato""Leon"null"Leon de los Aldama"21.122-101.68321"Zona Franca""Subnational""On 25 April 2025, in Leon de l…1null1745881592
"MEX103223"2025-04-25 00:00:0020251"Political violence""Violence against civilians""Attack""Unidentified Gang (Mexico)"null"Political militia""Civilians (Mexico)""Women (Mexico)""Civilians""Political militia-Civilians""Civilian targeting"484"North America""Mexico""Veracruz de Ignacio de la Llav…"Coxquihui"null"Sabanas de Xalostoc"20.2216-97.53491"Imagen del Golfo""Subnational""On 25 April 2025, in Sabanas d…2"women targeted: relatives of t…1745881593
…………………………………………………………………………………
"UGA13"1997-03-05 00:00:0019972"Political violence""Violence against civilians""Attack""LRA: Lords Resistance Army"null"Rebel group""Civilians (Uganda)""Women (Uganda)""Civilians""Rebel group-Civilians""Civilian targeting"800"Eastern Africa""Uganda""Lamwo""Lamwo""Palabek Kal""Palabek"3.433332.56671"New York Times""International""After failing to find deserter…9"women targeted: girls"1667868656
"ALG50"1997-02-24 00:00:0019971"Political violence""Violence against civilians""Attack""GIA: Armed Islamic Group"null"Rebel group""Civilians (Algeria)""Women (Algeria)""Civilians""Rebel group-Civilians""Civilian targeting"12"Northern Africa""Algeria""Medea""Berrouaghia"null"Berrouaghia"36.13522.91091"Algeria Watch""Other""24 March: 5 young girls were k…5"women targeted: girls"1638981224
"SIE4762"1997-01-22 00:00:0019972"Political violence""Violence against civilians""Sexual violence""RUF: Revolutionary United Fron…null"Rebel group""Civilians (Sierra Leone)""Women (Sierra Leone)""Civilians""Rebel group-Civilians""Civilian targeting"694"Western Africa""Sierra Leone""Northern""Tonkolili""Kholifa Rowalla""Magburaka"8.7167-11.952"AFP""International""Week of 22 January. RUF forces…40"women targeted: girls"1638981224
"RWA652"1997-01-15 00:00:0019973"Political violence""Violence against civilians""Attack""Unidentified Armed Group (Rwan…null"Political militia""Civilians (Spain)""Aid Workers (Spain); Women (Sp…"Civilians""Political militia-Civilians""Civilian targeting"646"Eastern Africa""Rwanda""North""Musanze""Cyuve""Ruhengeri"-1.499829.6351"Aid Worker Security Database""Local partner-Other""Around 15 January 1997 (month …3null1633983690
"NIR1"1997-01-06 00:00:0019971"Political violence""Violence against civilians""Attack""Tuareg Ethnic Militia (Niger)"null"Identity militia""Civilians (Niger)""Women (Niger)""Civilians""Identity militia-Civilians""Civilian targeting"562"Western Africa""Niger""Niamey""Ville de Niamey""Niamey III""Niamey"13.522.121"Reuters""International""A french woman was shot and ki…1null1622068223
In [26]:
import polars as pl
import pandas as pd
import re

# Get unique list of places and create the pattern
place_cols = ["country", "location", "admin1", "admin2", "admin3"]
place_series = [pl_df[col].drop_nulls().unique() for col in place_cols]
places = set()
for s in place_series:
    places.update(s.to_list())

# Normalise the names: keep strings only, strip surrounding whitespace,
# de-duplicate, and drop empties (an empty alternative adds nothing to the regex).
places = {name.strip() for name in places if isinstance(name, str)}
places.discard("")

# Order longest-first (ties broken alphabetically). Alternation takes the first
# alternative that matches, so a short name listed before a longer name that
# contains it would leave the tail of the longer name behind in the text.
# Sorting also makes the pattern identical between runs, which set iteration
# order is not.
places = sorted(places, key=lambda name: (-len(name), name))
pattern = r'\b(' + '|'.join(map(re.escape, places)) + r')\b'

# Apply cleaning directly in Polars
# Two passes over `notes`: remove every known place name, then remove
# standalone four-digit years starting with 19 or 20.
pl_df_clean = pl_df.with_columns(
    pl.col("notes")
      .cast(pl.String)
      .str.replace_all(pattern, "")
      .str.replace_all(r"\b(19|20)\d{2}\b", "")
      .alias("notes_clean")
)

# Extract the final list
# Rows whose cleaned note is null are skipped.
documents = pl_df_clean["notes_clean"].drop_nulls().to_list()
In [28]:
# Event ids for exactly the rows that contribute to `documents` (rows whose
# cleaned note is non-null). Dropping nulls from the id column on its own could
# leave `titles` and `documents` with different lengths or out of row order.
titles = (
    pl_df_clean
    .filter(pl.col("notes_clean").is_not_null())
    .get_column("event_id_cnty")
    .to_list()
)
In [30]:
import pandas as pd
import polars as pl

def glimpse_polars(df: pl.DataFrame, max_cols=100, max_rows=5):
    """Print a compact per-column overview of a Polars DataFrame.

    The report starts with the row and column counts, followed by one line
    per column showing its name, dtype, non-null count, number of unique
    values and a few example values.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to summarise.
    max_cols : int, default 100
        Only the first ``max_cols`` columns are described; if the frame has
        more, a trailing line reports how many were left out.
    max_rows : int, default 5
        Maximum number of example values listed per column. Examples are the
        first entries of ``drop_nulls().unique()``, so which values appear
        depends on the order Polars returns them in.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    rule = "-" * 100

    print(f"Observations: {df.height:,}")
    print(f"Variables: {df.width:,}")
    print(rule)

    summary_rows = []
    for position, name in enumerate(df.columns):
        # Columns are visited in order, so the first one past the limit ends
        # the scan.
        if position >= max_cols:
            break

        series = df.get_column(name)
        kind = series.dtype
        present = series.len() - series.null_count()
        distinct = series.n_unique()
        examples = series.drop_nulls().unique().slice(0, max_rows).to_list()

        summary_rows.append(
            {
                "Variable": name,
                "Type": kind,
                "Non-Null": f"{present:,}",
                "Unique": f"{distinct:,}",
                "Sample Values": examples,
            }
        )

    # pandas is used purely for its fixed-width text rendering.
    report = pd.DataFrame(summary_rows).to_string(index=False, max_colwidth=100)
    print(report)

    hidden = df.width - max_cols
    if df.width > max_cols:
        print(f"\n... and {hidden} more variables")
    print(rule)

# Summarise the cleaned frame, including the derived `notes_clean` column.
glimpse_polars(pl_df_clean)
Observations: 84,118
Variables: 32
----------------------------------------------------------------------------------------------------
          Variable                                     Type Non-Null Unique                                                                                        Sample Values
     event_id_cnty                                   String   84,118 84,118                                                      [SWE6397, ARG3600, FRA20469, CAN3752, ITA14805]
        event_date Datetime(time_unit='ns', time_zone=None)   84,118  5,331 [1997-01-06 00:00:00, 1997-01-15 00:00:00, 1997-01-22 00:00:00, 1997-02-24 00:00:00, 1997-03-05 0...
              year                                    Int64   84,118     29                                                                       [1997, 1998, 1999, 2000, 2001]
    time_precision                                    Int64   84,118      3                                                                                            [1, 2, 3]
     disorder_type                                   String   84,118      4     [Strategic developments, Political violence, Demonstrations, Political violence; Demonstrations]
        event_type                                   String   84,118      5    [Violence against civilians, Riots, Explosions/Remote violence, Strategic developments, Protests]
    sub_event_type                                   String   84,118     17 [Disrupted weapons use, Sexual violence, Protest with intervention, Remote explosive/landmine/IED...
            actor1                                   String   84,118  2,036 [Kashmir Rebels (India), Government of Kyrgyzstan (2017-2020), Rioters (Mauritius), Islamic State...
     assoc_actor_1                                   String   62,034 12,807 [The Korean Council; Buddhist Group (South Korea); Students (South Korea); Women (South Korea), A...
            inter1                                   String   84,118      8                    [Protesters, Political militia, Rebel group, External/Other forces, State forces]
            actor2                                   String   29,780    612 [Military Forces of Lebanon (2019-2020), Rioters (Guatemala), Police Forces of Guinea (2008-2010)...
     assoc_actor_2                                   String   26,441  3,837 [Government of South Korea (2022-); Liberal Unification Party; PPP: People Power Party; Protestan...
            inter2                                   String   29,780      9                                          [Protesters, State forces, Rebel group, Civilians, Rioters]
       interaction                                   String   84,118     23 [State forces-Rioters, Rioters-Protesters, State forces-Civilians, Rioters-Rioters, Identity mili...
civilian_targeting                                   String   23,658      2                                                                                 [Civilian targeting]
               iso                                    Int64   84,118    196                                                                                    [0, 4, 8, 12, 20]
            region                                   String   84,118     16                                      [Eastern Africa, Caribbean, East Asia, Middle East, South Asia]
           country                                   String   84,118    196                                         [Luxembourg, Papua New Guinea, Sint Maarten, Germany, Egypt]
            admin1                                   String   84,118  2,191                                             [Jablanica, Sint Maarten, Bogota, D.C., Imereti, Lusaka]
            admin2                                   String   82,163  9,938                                                       [Moro, Dalaho, Alcobaca, Chapulhuacan, Nueces]
            admin3                                   String   32,852  6,400                                        [Juong Kang, City of Tshwane, Bafoussam 1, Rohat, Sakinyonga]
          location                                   String   84,118 20,553                                                     [Tokyo, Guerrero Negro, Uruma, Qariyah, Colonie]
          latitude                                  Float64   84,118 20,919                                                   [-54.8062, -54.5119, -53.7865, -53.1548, -51.7308]
         longitude                                  Float64   84,118 21,104                                              [-171.7553, -161.7558, -159.7804, -159.3721, -158.4575]
     geo_precision                                    Int64   84,118      3                                                                                            [1, 2, 3]
            source                                   String   84,118 15,733 [El Tiempo (Colombia); La Opinion (Colombia), Melilla Hoy, Spectrum News Charlotte; Queen City Ne...
      source_scale                                   String   84,118     26 [Local partner-International, New media-National, Subnational, Local partner-National, Subnationa...
             notes                                   String   84,118 83,631 [On 8 March 2021, over 100 citizens called by the Feminist Action (AFdA) protested in Seu d'Urgel...
        fatalities                                    Int64   84,118     55                                                                                      [0, 1, 2, 3, 4]
              tags                                   String   63,623  2,702 [crowd size=an estimated 200, crowd size=2,000-15,000, crowd size=around 300; local administrator...
         timestamp                                    Int64   84,118 16,192                                         [1559160369, 1559160524, 1559160525, 1559160526, 1559160527]
       notes_clean                                   String   84,118 78,463 [On 14 May , two men approached a same-sex couple, used anti-LGBTQ+ slur, punched one of the wome...
----------------------------------------------------------------------------------------------------
In [32]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI

# --- Sentence embeddings ---------------------------------------------------
# Encode every cleaned note once; the same vectors are reused for clustering
# and for the 2-D projection below.
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# --- Components for the clustering pipeline --------------------------------
# 5-D UMAP reduction used ahead of clustering.
umap_model = UMAP(n_neighbors=40, n_components=5, min_dist=0.4,
                  metric='cosine', random_state=42)

# Density-based clustering on the reduced space.
hdbscan_model = HDBSCAN(min_cluster_size=40, min_samples=15,
                        metric='euclidean', cluster_selection_method='eom',
                        prediction_data=True)

# --- 2-D projection for plotting -------------------------------------------
# Fitted here, independently of `umap_model`, directly on the raw embeddings.
projection_2d = UMAP(n_neighbors=15, n_components=2, min_dist=0.0,
                     metric='cosine', random_state=42)
reduced_embeddings = projection_2d.fit_transform(embeddings)

# --- Bag-of-words vectoriser -----------------------------------------------
# Spanish and English stop words are merged into one exclusion set.
stopwords_total = set()
for language in ("spanish", "english"):
    stopwords_total.update(stopwords.words(language))
vectorizer_model = CountVectorizer(stop_words=list(stopwords_total))

# --- Topic representations --------------------------------------------------
# Three alternative ways of describing each topic, keyed by name.
# `client` and `prompt` are expected to be defined in an earlier cell.
openai_labeller = OpenAI(client=client, model="gpt-4o", prompt=prompt)
representation_model = {
    "KeyBERT": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3),
    "OpenAI": openai_labeller,
}
Batches:   0%|          | 0/2629 [00:00<?, ?it/s]
In [34]:
import os

# The run log shows the Hugging Face tokenizers "process just got forked"
# warning repeated during the clustering step. Declaring the setting
# explicitly is the remedy the warning itself suggests. It is done here —
# after the embeddings have already been computed — so the bulk encoding is
# unaffected, and with setdefault() so a value chosen by the user wins.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Assemble BERTopic from the pre-configured components.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=10,
    verbose=True
)

# Fit the model and assign a topic to each document. The precomputed
# embeddings are passed in so the documents are not encoded a second time.
topics, probs = topic_model.fit_transform(documents, embeddings)
2025-07-16 14:45:57,349 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-16 14:47:41,683 - BERTopic - Dimensionality - Completed ✓
2025-07-16 14:47:41,684 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-07-16 14:47:46,087 - BERTopic - Cluster - Completed ✓
2025-07-16 14:47:46,098 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 266/266 [03:38<00:00,  1.22it/s]
2025-07-16 14:51:56,562 - BERTopic - Representation - Completed ✓
In [38]:
# Overview table: one row per topic with its size, name and representations.
topic_overview = topic_model.get_topic_info()
print(topic_overview)

# (term, weight) pairs describing topic 0.
topic_zero_terms = topic_model.get_topic(0)
print(topic_zero_terms)

# Kept as the cell's final expression so the notebook displays its result.
topic_model.visualize_topics()
     Topic  Count                                       Name  \
0       -1  40496               -1_women_group_protest_march   
1        0   1364              0_found_body_wrapped_fatality   
2        1   1152   1_ransom_abducted_kidnapped_unidentified   
3        2   1121          2_conference_press_picketed_urged   
4        3   1006             3_femicide_justice_case_demand   
..     ...    ...                                        ...   
261    260     40   260_conversions_aurat_marriages_enforced   
262    261     40      261_japanese_weekly_apology_wednesday   
263    262     40            262_bigger_50th_roe_anniversary   
264    263     40  263_autism_children_simeonov_disabilities   
265    264     40    264_hijab_religious_sociopolitical_veil   

                                        Representation  \
0    [women, group, protest, march, members, woman,...   
1    [found, body, wrapped, fatality, colonia, tied...   
2    [ransom, abducted, kidnapped, unidentified, ar...   
3    [conference, press, picketed, urged, korean, s...   
4    [femicide, justice, case, demand, friends, fem...   
..                                                 ...   
261  [conversions, aurat, marriages, enforced, mini...   
262  [japanese, weekly, apology, wednesday, comfort...   
263  [bigger, 50th, roe, anniversary, coincide, wad...   
264  [autism, children, simeonov, disabilities, aut...   
265  [hijab, religious, sociopolitical, veil, guida...   

                                               KeyBERT  \
0    [protested, protesters, protest, demonstrators...   
1    [strangled, decapitated, dismembered, beheaded...   
2    [kidnapped, kidnappers, kidnapping, abductions...   
3    [picketed, pickets, solidarity, discrimination...   
4    [protested, protesters, protest, victims, acti...   
..                                                 ...   
261  [protest, march, wage, wages, demonstration, w...   
262  [reconciliation, seoul, participants, embassy,...   
263  [protesters, protests, demonstrators, abortion...   
264  [protested, disabilities, mothers, children, a...   
265  [hijabs, hijab, niqabs, sharia, islamic, polic...   

                                                   MMR  \
0    [group, protest, march, woman, violence, polic...   
1    [found, wrapped, fatality, colonia, plastic, t...   
2    [ransom, abducted, kidnapped, unidentified, na...   
3    [press, picketed, seoul, banners, harassment, ...   
4    [justice, cases, victims, feminist, victim, co...   
..                                                 ...   
261  [marriages, enforced, increase, march, demandi...   
262  [japanese, apology, wednesday, comfort, embass...   
263  [roe, january, commemorate, 51st, abortion, pr...   
264  [autism, disabilities, valeri, insulting, parl...   
265  [hijab, sociopolitical, veil, islamic, iranian...   

                                                OpenAI  \
0    [Protests Against Gender-Based Violence and Wo...   
1    [Femicide and Body Disposal Patterns in Urban ...   
2     [Kidnappings and Ransom Demands by Armed Groups]   
3    [Gender Equality and Anti-Discrimination Prote...   
4    [Protests Demanding Justice for Femicide Victims]   
..                                                 ...   
261    [Women's Rights and Demonstrations in Pakistan]   
262  [Comfort Women Protests at Former Japanese Emb...   
263  [Nationwide 'Bigger than Roe' Protests on Roe ...   
264  [Protests Against Valeri Simeonov over Insulti...   
265  [Enforcement of Religious Dress Codes by Irani...   

                                   Representative_Docs  
0    [On 3 July , community members marched in  (, ...  
1    [Around 3 April  (as reported), in , , a woman...  
2    [On 5 March , unidentified armed men abducted ...  
3    [On 4 September , protesters, including local ...  
4    [On 10 January , in , , about 100 people, incl...  
..                                                 ...  
261  [On 8 March , women held a demonstration durin...  
262  [On 7 February , participants held signs and b...  
263  [On 22 January , an unreported number of peopl...  
264  [On 28 November , mothers of children with dis...  
265  [Other: On 18 June , Iranian Guidance Patrol p...  

[266 rows x 8 columns]
[('found', 0.06839050266583661), ('body', 0.0683816689112666), ('wrapped', 0.04236840290668702), ('fatality', 0.04149472584708486), ('colonia', 0.038043925667171755), ('tied', 0.03600635930266989), ('reported', 0.03454916678900737), ('plastic', 0.032110858453526715), ('signs', 0.028939472810125377), ('torture', 0.027406376602467974)]
In [40]:
# Show the topic overview as a rich table (bare last expression of the cell).
topic_model.get_topic_info()
Out[40]:
Topic Count Name Representation KeyBERT MMR OpenAI Representative_Docs
0 -1 40496 -1_women_group_protest_march [women, group, protest, march, members, woman,... [protested, protesters, protest, demonstrators... [group, protest, march, woman, violence, polic... [Protests Against Gender-Based Violence and Wo... [On 3 July , community members marched in (, ...
1 0 1364 0_found_body_wrapped_fatality [found, body, wrapped, fatality, colonia, tied... [strangled, decapitated, dismembered, beheaded... [found, wrapped, fatality, colonia, plastic, t... [Femicide and Body Disposal Patterns in Urban ... [Around 3 April (as reported), in , , a woman...
2 1 1152 1_ransom_abducted_kidnapped_unidentified [ransom, abducted, kidnapped, unidentified, ar... [kidnapped, kidnappers, kidnapping, abductions... [ransom, abducted, kidnapped, unidentified, na... [Kidnappings and Ransom Demands by Armed Groups] [On 5 March , unidentified armed men abducted ...
3 2 1121 2_conference_press_picketed_urged [conference, press, picketed, urged, korean, s... [picketed, pickets, solidarity, discrimination... [press, picketed, seoul, banners, harassment, ... [Gender Equality and Anti-Discrimination Prote... [On 4 September , protesters, including local ...
4 3 1006 3_femicide_justice_case_demand [femicide, justice, case, demand, friends, fem... [protested, protesters, protest, victims, acti... [justice, cases, victims, feminist, victim, co... [Protests Demanding Justice for Femicide Victims] [On 10 January , in , , about 100 people, incl...
... ... ... ... ... ... ... ... ...
261 260 40 260_conversions_aurat_marriages_enforced [conversions, aurat, marriages, enforced, mini... [protest, march, wage, wages, demonstration, w... [marriages, enforced, increase, march, demandi... [Women's Rights and Demonstrations in Pakistan] [On 8 March , women held a demonstration durin...
262 261 40 261_japanese_weekly_apology_wednesday [japanese, weekly, apology, wednesday, comfort... [reconciliation, seoul, participants, embassy,... [japanese, apology, wednesday, comfort, embass... [Comfort Women Protests at Former Japanese Emb... [On 7 February , participants held signs and b...
263 262 40 262_bigger_50th_roe_anniversary [bigger, 50th, roe, anniversary, coincide, wad... [protesters, protests, demonstrators, abortion... [roe, january, commemorate, 51st, abortion, pr... [Nationwide 'Bigger than Roe' Protests on Roe ... [On 22 January , an unreported number of peopl...
264 263 40 263_autism_children_simeonov_disabilities [autism, children, simeonov, disabilities, aut... [protested, disabilities, mothers, children, a... [autism, disabilities, valeri, insulting, parl... [Protests Against Valeri Simeonov over Insulti... [On 28 November , mothers of children with dis...
265 264 40 264_hijab_religious_sociopolitical_veil [hijab, religious, sociopolitical, veil, guida... [hijabs, hijab, niqabs, sharia, islamic, polic... [hijab, sociopolitical, veil, islamic, iranian... [Enforcement of Religious Dress Codes by Irani... [Other: On 18 June , Iranian Guidance Patrol p...

266 rows × 8 columns

In [42]:
# Print one line per topic using the label stored under the "OpenAI"
# representation.
openai_topics = topic_model.get_topics(full=True)["OpenAI"]

# Pad ids to the widest one (never narrower than the original 2 columns) so
# that three-digit topic ids line up with the rest.
id_width = max(2, max((len(str(topic_id)) for topic_id in openai_topics), default=2))

for topic_id, label_info in openai_topics.items():
    # The label is read from the first element of the first entry; an empty
    # entry yields a placeholder instead of raising IndexError.
    raw_label = label_info[0][0] if label_info else ""

    # Keep only the first non-empty line of the generated text.
    stripped = raw_label.strip()
    label = stripped.split("\n")[0].strip() if stripped else ""

    # Some generated labels carry a redundant leading "Topic:" marker.
    if label.lower().startswith("topic:"):
        label = label[len("topic:"):].lstrip()

    print(f"Topic {topic_id:>{id_width}}: {label or '(no label)'}")
Topic -1: Protests Against Gender-Based Violence and Women's Rights Advocacy
Topic  0: Femicide and Body Disposal Patterns in Urban Areas
Topic  1: Kidnappings and Ransom Demands by Armed Groups
Topic  2: Gender Equality and Anti-Discrimination Protests in South Korea
Topic  3: Protests Demanding Justice for Femicide Victims
Topic  4: Protests and Riots Over Mahsa Amini's Death
Topic  5: Abortion Rights Protests Following Overturning of Roe v. Wade
Topic  6: Anganwadi Workers' Protests for Regularization and Increased Wages
Topic  7: Global Demonstrations for Abortion Legalization on September 28
Topic  8: Nationwide Flower Demonstrations in Support of MeToo Movement Against Sexual Violence
Topic  9: Suppression of Petitioners and Rights Defenders During Political Conventions in China
Topic 10: Political Protests by BJP and BJP Mahila Morcha Against INC and AIMC Actions
Topic 11: Sexual Violence in Darfur by Militias and RSF
Topic 12: Drive-by Shootings Involving Motorcycles and Female Victims
Topic 13: Abductions by ISWAP/Boko Haram Militants
Topic 14: Military Arrests and Detentions in Various Regions
Topic 15: Meira Paibi Protests Over Arrest of UNLF-P Cadres Amidst Meitei-Tribal Tensions
Topic 16: Women's Rights and Anti-Violence Activism by Kvinnostrejk Movement
Topic 17: Female Student Protests and College Infrastructure Issues
Topic 18: Protests Against Mahsa Amini's Death and Iranian Government
Topic 19: Gun Violence Awareness and Advocacy Activities by Moms Demand Action
Topic 20: Sexual Violence by Military Forces
Topic 21: Comfort Women Protests and Japanese Government Apology
Topic 22: Witchcraft-Related Violence and Accusations
Topic 23: International Day for the Elimination of Violence against Women Protests
Topic 24: Violence Against Transgender Individuals and Hate Crimes
Topic 25: Asha Workers Protests for Wage Increase and Regularization in J&K
Topic 26: Nationwide Protests Against Transphobia and Senate Bill
Topic 27: University Student Protests for Justice for Mahsa Amini
Topic 28: Water Supply Protests in J&K
Topic 29: Nationwide Protests Against Trump Administration's Policies and Support for Civil Rights
Topic 30: Women's Protests Against Military Coup in Support of Civil Disobedience Movement in [Region]
Topic 31: Political Violence in West Bengal Elections
Topic 32: Protests Against Far-Right Ideologies and AfD Activities
Topic 33: Taliban Violations Against Women and Girls
Topic 34: Police Violence Against Women
Topic 35: Ambazonian Separatists' Violent Acts Against Civilians and Alleged Collaborators
Topic 36: Mothers' Day Protest for Missing Persons Justice
Topic 37: Protests Against Constitutional Revision of Article 9 in Japan
Topic 38: Violent Incidents Involving Armed Individuals Targeting Women in Michoacan
Topic 39: Black Lives Matter Protests and Advocacy Against Police Brutality
Topic 40: International Women's Day March for Gender Equality and Rights
Topic 41: Protests Against Taliban's Women's Rights Violations
Topic 42: Houthi-Sponsored Protests Supporting Palestinians Against Israeli Actions
Topic 43: Protests Against Isolation Policy of Abdullah Ocalan by Kurdish Groups
Topic 44: Kazakh Female Protests for Subsidized Housing and Social Assistance
Topic 45: Houthi Sniper Attacks on Civilians in Yemen
Topic 46: Global Climate Activism Protests by Extinction Rebellion
Topic 47: Lady Health Workers' Salary Protests
Topic 48: Israeli Settler Attacks on Palestinians in West Bank
Topic 49: Pro-Choice Demonstrations Following Overturn of Roe v. Wade
Topic 50: Red Dress Day Awareness for Missing and Murdered Indigenous Women
Topic 51: Drug Trafficking-Related Shootings of Women
Topic 52: Protests Against Gender-Based Violence in Universities
Topic 53: Women's Protests Against Relocation of Liquor Shops
Topic 54: Pro-Palestinian Demonstrations Amid Israel-Hamas Conflict
Topic 55: Sexual Misconduct by Police Officers
Topic 56: Women's Rights Activism and Demonstrations
Topic 57: Motorcycle Drive-by Shootings Targeting Women
Topic 58: Imbonerakure Violence against Women and Minors
Topic 59: Protests Against Gender-Based Violence in November and December
Topic 60: Protests Against Citizenship Amendment Act and National Register of Citizens
Topic 61: Women's March 2020
Topic 62: International Women's Day Protests for Gender Equality
Topic 63: Killings by Unidentified Armed Groups in Kivu and Surrounding Regions
Topic 64: Female Homicides Involving Burning of Bodies
Topic 65: Ukrainian Protests Against Russian Invasion
Topic 66: Women's Rights and Abortion Protests
Topic 67: Military Attacks and Arson in Villages
Topic 68: Protests Against Sexual Harassment in Wrestling Federation
Topic 69: Parent Protests in Elementary Schools Over Administration Issues and Resource Management
Topic 70: Political Candidate Attacks and Violence
Topic 71: Wartime Sexual Violence and Killings Involving Russian and Ukrainian Forces
Topic 72: Hijab Protests and Counter-Protests in Educational Institutions
Topic 73: Saturday Mothers' Weekly Protests for Justice
Topic 74: Women's Protests and Public Vandalism on International Women's Day
Topic 75: Abortion Rights Protests and Counter-Demonstrations
Topic 76: Mob Violence Against Women Suspected as Child Lifters
Topic 77: Topic: Fulani-Related Violence and Attacks on Civilians
Topic 78: Off-Duty Female Police Officer Fatalities by Armed Attackers
Topic 79: HDP Protests Against Dismissal of Mayors
Topic 80: Protests Against Dismissal of DEM Party Mayor Mehmet Siddik Akis and Trustee Appointment
Topic 81: Armed Violence and Fatalities Involving FARC Dissidents and ELN in Rural Areas
Topic 82: Protests by Female Street Traders Against Market Expulsions and Relocations
Topic 83: Inter-Community Violence and Protest in Meitei and Tribal Areas
Topic 84: Caste-Based Sexual Violence Against Dalit Women in Uttar Pradesh
Topic 85: Abortion Rights Protests Following Leaked Supreme Court Draft
Topic 86: International Protest Against Gender-Based Violence
Topic 87: Nationwide Flower Demonstrations Against Sexual Violence Acquittals
Topic 88: Protests Against Netanyahu's Judicial Overhaul
Topic 89: International Women's Day Demonstrations and Women's Issues
Topic 90: Women's Protests Against Energy Shortages in KPK
Topic 91: Protests by Swedish-Iranian Community against Death of Kurdish Woman in Police Custody
Topic 92: Student Protests Against School Handling of Sexual Assault and Misconduct
Topic 93: Protests Against Rape and Murder of Woman Veterinary Doctor
Topic 94: Protests Against Supreme Court's Dobbs Decision on Abortion Rights
Topic 95: Protests Against Violence Towards Women
Topic 96: Protests Against Macron's Appointment of Michel Barnier as Prime Minister
Topic 97: Protests Against Violence and Rape in Bangladesh
Topic 98: Gang-Related Violence Against Women
Topic 99: Female Textile Workers' Protests for Unpaid Wages
Topic 100: Women's March Protesting Amy Coney Barrett's Supreme Court Nomination
Topic 101: Women's Strike Movement Protesting Abortion Law Restrictions
Topic 102: Women's Protests for Political Prisoners' Immediate Release in Bahrain
Topic 103: Military Police Officers' Wives Protesting Overdue Salaries
Topic 104: Violence and Human Rights Abuses by Mayi-Mayi Militias in Kivu Region
Topic 105: Women's Rights Demonstrations in Dalarna
Topic 106: University Fraternity Sexual Assault Protests
Topic 107: Women's Rights Activists Protest for Hostage Release from Hamas
Topic 108: Protests Against Agricultural Ordinances and Debt Waivers for Farmers
Topic 109: Protests Against Turkish Military Operations in Kurdish Regions
Topic 110: Boeung Kak and Borei Keila Land Disputes and Protests in Cambodia
Topic 111: Women's Rally Against Abortion Restrictions
Topic 112: Supreme Court and Abortion Legislation Protests
Topic 113: Alleged Poisoning and Mass Sociogenic Illness in Schools
Topic 114: Women's Protests Against Abortion Restrictions
Topic 115: Protests Against Bolsonaro's Presidency and Gender-Based Violence in Brazil
Topic 116: Turkish Withdrawal from Women's Rights Convention Protest
Topic 117: Fuel Price Hikes and Socioeconomic Protests
Topic 118: Al Shabaab Attacks on Civilians and Officials
Topic 119: Supreme Court Draft Opinion and Roe v. Wade Protests
Topic 120: Vigilante Killings of Female Drug Suspects
Topic 121: Breast Cancer Awareness and Advocacy
Topic 122: Take Back the Night Marches Against Gender-Based Violence
Topic 123: Women's Protests Against Government Withdrawal from Domestic Violence Convention
Topic 124: Sexual Violence by Armed Groups
Topic 125: Saturday Mothers' Protests for Detainee Justice
Topic 126: Houthi-Sponsored Demonstrations Against Israeli and Western Actions
Topic 127: Abortion Rights Protests in Response to Leaked Supreme Court Draft
Topic 128: Vanessa Guillen Vigil and Justice Protests
Topic 129: Election Protests and Allegations of Fraud
Topic 130: Student Protests for Justice and Police Reform
Topic 131: Protests for Health Access for Severely Ill Prisoners
Topic 132: Political Protests by Women Wings Against Arrests in Pakistan
Topic 133: Protests Against Islamic Government's Hijab Policy
Topic 134: International Women's Day Protests for Gender Equality and Labor Rights
Topic 135: Protests Against Insecurity and Fulani Pastoralists in Akoko Region
Topic 136: Nationwide Flower Demonstrations Against Sexual Violence Acquittals
Topic 137: Ethnic Conflict and Violence between Murle and Lou Nuer Communities
Topic 138: Women's Activist Rally Against Abortion Restrictions
Topic 139: PKK-affiliated Youth Kidnappings for Conscription in Rural Areas
Topic 140: Protests by Families and Associations for the Release of Political Detainees and Disappeared Individuals
Topic 141: Detention of Women by QSD Forces in Rural Areas for Unknown Reasons
Topic 142: ADF Rebel Attacks on Civilians in Villages
Topic 143: Pro-Choice Demonstrations Responding to Supreme Court Draft Leak
Topic 144: Civil Society Protests Against Gender-Based Violence in June
Topic 145: Targeted Arson and Vandalism Against Political Figures' Properties
Topic 146: Government Crackdown on Ladies in White Activism
Topic 147: Protests Against Violence in Arab Israeli Communities
Topic 148: International Women's Day Protests Against Gender Violence and Inequality
Topic 149: International Women's Day Demonstrations for Equal Pay and Women's Rights
Topic 150: International Day for the Elimination of Violence Against Women Protests
Topic 151: NDWO Protests Against Inflation and Corruption Across 77 Districts
Topic 152: International Women's Day Protests Against Femicides and for Women's Rights
Topic 153: Mahsa Amini Protests and Mourning Observance in Iran
Topic 154: Korean Farmers' Advocacy and Press Conferences
Topic 155: Women's Protests Against HTS in Countryside Towns
Topic 156: Nepali Congress Anti-Government Demonstrations
Topic 157: Unsolved Home Invasion Murders of Women
Topic 158: Opposition to Legalisation of Alcohol in the State through Women's Sit-in Demonstrations
Topic 159: Iranian Women's Rights Protests Against Hijab Enforcement and Police Brutality
Topic 160: Kurdish Newroz Celebrations and Demands for Abdullah Ocalan's Release
Topic 161: Prison Violence Against Female Political Prisoners
Topic 162: Protests Against Quran Burning by Far-Right Politician
Topic 163: Baloch Sunni Protests Against Government Violence
Topic 164: Dahalo Militia Abductions and Violence in Madagascar
Topic 165: Houthi-Sponsored Protest in Solidarity with Palestinians Against Zionist Actions
Topic 166: Protests for Justice and Prevention of Child Murders
Topic 167: Advocacy for Prostitution Law Amendment
Topic 168: Labor Rights Protest by Domestic and Care Workers in the Basque Region
Topic 169: Student Protests Against Leaked Supreme Court Draft on Abortion Rights
Topic 170: Protests Against Gender-Based Violence on the International Day for the Elimination of Violence Against Women
Topic 171: Women's Rally Against Abortion Restrictions
Topic 172: International Day for the Elimination of Violence Against Women Demonstrations
Topic 173: Alleged Poisonings and Mass Hysteria in Schools
Topic 174: National Strike Against Austerity and for Wage Increases
Topic 175: Anti-Nuclear Power Plant Protests and Advocacy
Topic 176: Protests Against Rising Fuel Prices by Congress Women's Wing
Topic 177: International Women's Day Demonstrations on Women's Rights and Inequality Issues
Topic 178: International Women's Day Protests Against Femicides
Topic 179: WOZA Educational Protests
Topic 180: National Protests Against Austerity and For Wage Increases
Topic 181: Women's Strike Movement Abortion Protests
Topic 182: Sexual Harassment Allegations and Protests
Topic 183: Women's Demand for Gender Equality in the Catholic Church
Topic 184: Alleged Poisoning of Female Students and Protests Against Government Involvement
Topic 185: RLD State-Wide Protests Against Communal Violence and Demand for President's Rule
Topic 186: Women's Strike Movement Protests Against Abortion Restrictions
Topic 187: Protests Against Macron's Support for Gerard Depardieu Amid Rape Accusations
Topic 188: Anti-Far-Right Protests in France Following European Elections
Topic 189: Protests Over Mahsa Amini's Death in Police Custody
Topic 190: Advocacy for Comprehensive Laws Against Sexual Violence
Topic 191: One Billion Rising Protests Against Violence on February 14
Topic 192: Taliban Media Restrictions and Gender-Based Bans
Topic 193: Justice Protests for Nirmala Pant's Murder and Rape Case
Topic 194: Mob Justice and Lynching of Women Accused of Crimes
Topic 195: Prison Visit Suspensions and Protests
Topic 196: Protest Against Attack on Jatiya Parishad State President
Topic 197: Sexual Violence by SPLA and SPLA-IO Forces in Conflict Zones
Topic 198: Meitei Community Protests and Government Response
Topic 199: International Women's Day Protests for Gender Equality and Against Gender-Based Violence
Topic 200: Gang Violence and Sexual Assault Amid G-9 and G-Pep La Clashes
Topic 201: Protest Against Gender-Based Violence Following Children's Disappearance
Topic 202: International Women's Day Protest for Gender Equality
Topic 203: CODECO-URDPC Attacks on Civilians and Looting Activities
Topic 204: Women's Strike Rally Against Abortion Restrictions
Topic 205: Houthi-Sponsored Protests in Solidarity with Palestine and Opposition to Israel
Topic 206: Houthi-Sponsored Protest Commemorating Saleh Ali Al Samad and Supporting Palestinian Solidarity
Topic 207: Detainment and Re-education of Uyghur and Kazakh Women in China
Topic 208: Women's Strike Protests Against Abortion Restrictions
Topic 209: Protests Over Rape-Murder of PGT Doctor at RG Kar Medical College
Topic 210: Advocacy for Missing Persons Detained by the State
Topic 211: Women's Rights and Abortion Protests
Topic 212: International Women's Day Demonstrations for Equal Rights and Pay
Topic 213: Protests Over Benazir Income Support Programme (BISP) Issues
Topic 214: Protest Against Agriculture Minister's Derogatory Remarks by BJP Mahila Morcha
Topic 215: Environmental and Economic Opposition to Iron Sand Mining
Topic 216: Demonstration Against Violence on Women Before International Day
Topic 217: Political Violence and Intimidation in Zimbabwean Party Conflicts
Topic 218: Protests Commemorating EDSA People Power Revolution and Opposing Current Philippine Leadership
Topic 219: Protests Against Violence on International Day for the Elimination of Violence Against Women
Topic 220: Women's Protest Against Government Apathy Amid COVID-19 Migrant Crisis
Topic 221: Meira Paibi Protests Against NIA Case on Arambai Tenggol Amidst Meitei-Tribal Violence
Topic 222: Student Protests Against Rape and Violence in Country on February 23
Topic 223: Female Health Workers' Protest for Pending Salaries
Topic 224: Maternity Ward Closures and Protests for Reopening Emergency Services
Topic 225: Sudanese Demonstrations Against Military Coup and Arrests
Topic 226: Abortion Rights and Religious Protest Dynamics
Topic 227: Protests Against AFSPA and Arrest of PREPAK Members in Manipur
Topic 228: Advocacy for Voting Rights and Commemoration of January 6th Capitol Riots Anniversary
Topic 229: Student Protests Against Government Over Death in Police Custody
Topic 230: Arrest and Detention of Women in Syrian and Turkish Countryside for Unknown Reasons
Topic 231: Houthi Protests and Zaydi Shiite Commemoration Activities
Topic 232: Political Protests and Impeachment of President Yoon Suk-yeol
Topic 233: Moroccan Protests Against Israeli Operations and Normalization
Topic 234: Drug Trafficking and Torture Incidents
Topic 235: Houthi-Sponsored Protest and Solidarity with Palestinians and Lebanese on 10th Anniversary
Topic 236: Meitei Community's Demand for Scheduled Tribe Status
Topic 237: Women's Protests and Political Demands in Artsakh and Belarus
Topic 238: Women's Protests Against Microfinance Loan Policies
Topic 239: Protests Against Indian Government's Actions in Kashmir
Topic 240: Protests Against Government Pressure on Independent Media
Topic 241: Protests Against Rising Violence and Rape in District
Topic 242: Medically Assisted Procreation Law Protests and Counter-Protests
Topic 243: Protests for Drug Haul Case Reinvestigation
Topic 244: Violence by Presumed Muslim Separatists Against Female Civilians in Thai Villages
Topic 245: Protests Against Polish Constitutional Court's Abortion Ban
Topic 246: Anti-Nuclear Protests and Peace Wave Movement Commemorating Atomic Bombings
Topic 247: Women's Rights Demonstrations and Police Intervention
Topic 248: Houthi-Sponsored Protest Commemorating Tanumah Massacre and Solidarity with Palestine
Topic 249: Midwives' Protest Against Law Affecting Job Security
Topic 250: Indigenous Rights Protest for Missing Women at Landfills
Topic 251: Women's Rights and Abortion Legalization Protests on International Women's Day
Topic 252: Women's Protest Against Abortion Restrictions
Topic 253: Women's Rights Protests and 'A Rapist in Your Path' Performances
Topic 254: Anti-War Protests Organized by Codepink
Topic 255: Military Service Deaths Protest Movement
Topic 256: Protests Against Modesty Police Following Death of Young Woman
Topic 257: Protests against Morality Police after Mahsa Amini's Death
Topic 258: Demonstrations Supporting Rule of Law Against Far-Right Leader's Conviction
Topic 259: Nationwide Farmers' Protest Against Farm Laws
Topic 260: Women's Rights and Demonstrations in Pakistan
Topic 261: Comfort Women Protests at Former Japanese Embassy in Seoul
Topic 262: Nationwide 'Bigger than Roe' Protests on Roe v. Wade 50th Anniversary
Topic 263: Protests Against Valeri Simeonov over Insulting Remarks to Mothers of Children with Disabilities
Topic 264: Enforcement of Religious Dress Codes by Iranian Morality Police
In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Bring the country column (and the text column used for the null filter) over
# from the Polars frame; rows are expected to stay aligned with `topics`.
df_plot = (
    pl_df_clean
    .select(["country", "notes_clean"])
    .drop_nulls()
    .to_pandas()
)

# There must be exactly one topic assignment per remaining document
assert len(df_plot) == len(topics), "❌ 'topics' length does not match the cleaned documents."

# One row per document: its country and its assigned topic
df_doc_topics = pd.DataFrame({
    "country": df_plot["country"].values,
    "topic": topics
})

# Documents per (country, topic) pair ...
country_topic_sizes = df_doc_topics.groupby(["country", "topic"]).size()
topic_counts = country_topic_sizes.reset_index(name="count")

# ... and each pair's share of all documents of that country
topic_counts["total_country"] = topic_counts.groupby("country")["count"].transform("sum")
topic_counts["percentage"] = 100 * topic_counts["count"] / topic_counts["total_country"]

# Use the first line of each topic's "OpenAI" representation as its label
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
label_records = []
for topic_id, label_info in openai_topics.items():
    first_line = label_info[0][0].partition("\n")[0]
    label_records.append({"topic": topic_id, "label": first_line})
labels_df = pd.DataFrame(label_records)

# Align key dtypes, then attach the labels (topics without a label keep NaN)
labels_df["topic"] = labels_df["topic"].astype(int)
topic_counts["topic"] = topic_counts["topic"].astype(int)
topic_counts = topic_counts.merge(labels_df, on="topic", how="left")

# Keep the top_n topics with the highest share inside every country
top_n = 5
ranked_counts = topic_counts.sort_values(["country", "percentage"], ascending=[True, False])
top_topics_per_country = ranked_counts.groupby("country").head(top_n)

# Plot only the first 6 countries in alphabetical order (two rows of three facets)
countries_to_plot = (
    top_topics_per_country["country"]
    .drop_duplicates()
    .sort_values()
    .head(6)
)
in_selected_countries = top_topics_per_country["country"].isin(countries_to_plot)
top_topics_per_country = top_topics_per_country[in_selected_countries]

# One horizontal bar chart per country; y axes are independent because every
# country has its own set of topic labels
g = sns.FacetGrid(
    top_topics_per_country,
    col="country", col_wrap=3,
    sharey=False,
    height=4, aspect=1.5
)
g.map_dataframe(sns.barplot, x="percentage", y="label", palette="tab10")

g.set_titles(col_template="{col_name}")
g.set_axis_labels("Percentage (%)", "Topic")

# Keep the topic labels horizontal on every facet
for ax in g.axes.flatten():
    plt.setp(ax.get_yticklabels(), rotation=0)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [46]:
# First line of every topic's "OpenAI" representation, in the order the model returns them
openai_aspect = topic_model.get_topics(full=True)["OpenAI"]
openai_labels = [representation[0][0].partition("\n")[0] for representation in openai_aspect.values()]

# Register them as the model's custom labels so the plot below can use them
topic_model.set_topic_labels(openai_labels)

# Interactive 2-D document map coloured by topic, with custom labels and document hover text
topic_model.visualize_documents(
    titles,
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
    hide_document_hover=False,
    custom_labels=True,
)
In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Event dates for every document that has both a date and cleaned text
df_plot = (
    pl_df_clean
    .select(["event_date", "notes_clean"])
    .drop_nulls()
    .to_pandas()
)
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])

# There must be exactly one topic assignment per remaining document
assert len(df_plot) == len(topics), "❌ Length of 'topics' does not match cleaned documents."

# One row per document: event date and assigned topic
df_doc_topics = pd.DataFrame({
    "event_date": df_plot["event_date"].values,
    "topic": topics
})

# Bucket every event into the first day of its calendar month
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()

# Documents per (month, topic) pair
month_topic_sizes = df_doc_topics.groupby(["month", "topic"]).size()
monthly_topic_counts = month_topic_sizes.reset_index(name="count")

# Share of each topic among all documents of the same month
monthly_topic_counts["monthly_total"] = monthly_topic_counts.groupby("month")["count"].transform("sum")
monthly_topic_counts["percentage"] = 100 * monthly_topic_counts["count"] / monthly_topic_counts["monthly_total"]

# Use the first line of each topic's "OpenAI" representation as its label
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
label_records = []
for topic_id, label_info in openai_topics.items():
    first_line = label_info[0][0].partition("\n")[0]
    label_records.append({"topic": topic_id, "label": first_line})
labels_df = pd.DataFrame(label_records)

# Align key dtypes, then attach the labels
monthly_topic_counts["topic"] = monthly_topic_counts["topic"].astype(int)
labels_df["topic"] = labels_df["topic"].astype(int)
monthly_topic_counts = monthly_topic_counts.merge(labels_df, on="topic", how="left")

# The top_n topics with the most documents over the whole period
top_n = 5
documents_per_topic = monthly_topic_counts.groupby("topic")["count"].sum()
top_topics = documents_per_topic.nlargest(top_n).index.tolist()
is_top_topic = monthly_topic_counts["topic"].isin(top_topics)
df_top = monthly_topic_counts[is_top_topic]

# One line per topic: monthly share of documents
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=df_top, x="month", y="percentage", hue="label", marker="o")

# Legend goes outside the axes; tight_layout below reserves the right 15% for it
ax.legend(title="Topic (OpenAI)", bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0)
ax.set_title(f"Monthly trend (% of total) of the top {top_n} most frequent topics")
ax.set_xlabel("Month")
ax.set_ylabel("Percentage of documents (%)")
plt.xticks(rotation=45)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()
No description has been provided for this image
In [50]:
import itertools
import pandas as pd

# Fixed palette, cycled so that every non-outlier topic receives a colour
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
plotted_topic_ids = [topic for topic in set(topic_model.topics_) if topic != -1]
color_key = {str(topic): shade for topic, shade in zip(plotted_topic_ids, colors)}

# One row per document: 2-D coordinates, topic id (as string) and text length.
# `documents` must have one entry per row of `reduced_embeddings`.
dfo = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "Topic": [str(t) for t in topic_model.topics_],
    "Length": [len(doc) for doc in documents],
})

# Drop the outlier topic and every point outside the (-10, 10) x (-10, 10) window
is_clustered = dfo.Topic != "-1"
inside_window = (dfo.y > -10) & (dfo.y < 10) & (dfo.x < 10) & (dfo.x > -10)
dfo = dfo.loc[is_clustered & inside_window, :]
dfo["Topic"] = dfo["Topic"].astype("category")

# Cluster centroids: per-topic mean of the numeric columns, ordered by numeric topic id
mean_df = (
    dfo.groupby("Topic")
    .mean()
    .reset_index()
    .assign(Topic=lambda centroids: centroids["Topic"].astype(int))
    .sort_values("Topic")
)
In [52]:
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe
import textwrap

# Scatter plot of all non-outlier documents: colour = topic, marker size = text length
fig = plt.figure(figsize=(20, 20))
sns.scatterplot(
    data=dfo,
    x='x',
    y='y',
    hue='Topic',
    palette=color_key,
    alpha=0.4,
    size='Length',
    sizes=(10, 200),
    legend=False
)

# `custom_labels_` is a positional list that BERTopic aligns with the sorted
# unique topic ids (the outlier topic -1 comes first when it exists). Indexing
# it with the topic id itself therefore returns the label of the previous
# topic whenever outliers are present, so build an explicit id -> label map.
label_by_topic = dict(zip(sorted(set(topic_model.topics_)), topic_model.custom_labels_))

# Annotate the centroids of the topics with ids 0..max_annotated_topic
max_annotated_topic = 50
texts, xs, ys = [], [], []
for _, centroid in mean_df.iterrows():
    # iterrows() upcasts the row to float, hence the int() round-trip
    topic = int(centroid["Topic"])
    if topic > max_annotated_topic:
        continue
    # Wrap long labels at 20 characters so they stay compact on the plot
    name = textwrap.fill(label_by_topic[topic], 20)
    xs.append(centroid["x"])
    ys.append(centroid["y"])
    texts.append(plt.text(centroid["x"], centroid["y"], name, size=10, ha="center", color=color_key[str(topic)],
                          path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]
                          ))

# Adjust annotations such that they do not overlap
adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
plt.axis('off')
plt.legend('', frameon=False)
plt.show()
No description has been provided for this image
In [62]:
# Date and country for every document that has date, country and cleaned text
df_plot = (
    pl_df_clean
    .select(["event_date", "country", "notes_clean"])
    .drop_nulls()
    .to_pandas()
)
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])

# There must be exactly one topic assignment per remaining document
assert len(df_plot) == len(topics), "❌ Length of topics does not match the documents."

# One row per document: event date, country and assigned topic
df_doc_topics = pd.DataFrame({
    "event_date": df_plot["event_date"].values,
    "country": df_plot["country"].values,
    "topic": topics
})

# Bucket every event into the first day of its calendar month
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()

# Documents per (month, country, topic) triple
triple_sizes = df_doc_topics.groupby(["month", "country", "topic"]).size()
grouped = triple_sizes.reset_index(name="count")

# Share of each topic among the documents of the same month and country
grouped["total"] = grouped.groupby(["month", "country"])["count"].transform("sum")
grouped["percentage"] = 100 * grouped["count"] / grouped["total"]

# Use the first line of each topic's "OpenAI" representation as its label
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
label_records = []
for topic_id, label_info in openai_topics.items():
    first_line = label_info[0][0].partition("\n")[0]
    label_records.append({"topic": topic_id, "label": first_line})
labels_df = pd.DataFrame(label_records)

# Align key dtypes, then attach the labels
grouped["topic"] = grouped["topic"].astype(int)
labels_df["topic"] = labels_df["topic"].astype(int)
grouped = grouped.merge(labels_df, on="topic", how="left")

# Keep the top_n topics with the highest share for every (month, country) pair.
# NOTE(review): topic -1 is ranked and labelled like any other topic here (see
# the preview below) - confirm that this is intended for the exported agendas.
top_n = 3
ranked = grouped.sort_values(["month", "country", "percentage"], ascending=[True, True, False])
top_agendas = ranked.groupby(["month", "country"]).head(top_n)

top_agendas.head()
Out[62]:
month country topic count total percentage label
0 1997-01-01 Niger -1 1 1 100.0 Protests Against Gender-Based Violence and Wom...
1 1997-01-01 Rwanda -1 1 1 100.0 Protests Against Gender-Based Violence and Wom...
2 1997-01-01 Sierra Leone -1 1 1 100.0 Protests Against Gender-Based Violence and Wom...
3 1997-02-01 Algeria -1 1 1 100.0 Protests Against Gender-Based Violence and Wom...
4 1997-03-01 Uganda -1 1 1 100.0 Protests Against Gender-Based Violence and Wom...
In [66]:
# Persist the per-month, per-country top agendas (row index is not written)
trained_agendas_path = "Output/Trained_Agendas.csv"
top_agendas.to_csv(trained_agendas_path, index=False)
In [68]:
# Monthly Classification 
In [70]:
!pip install sentence-transformers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Requirement already satisfied: sentence-transformers in /opt/anaconda3/lib/python3.11/site-packages (5.0.0)
Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.53.2)
Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0)
Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2)
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4)
Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4)
Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0)
Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1)
Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0)
Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1)
Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1)
Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5)
Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)
Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1)
Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3)
Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4)
Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3)
Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.21.2)
Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26)
Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)
In [84]:
import pandas as pd

# Read the raw June 2025 export
df_women = pd.read_csv("Input/Jun25.csv")

# Keep the events whose associated actor mentions "women" (case-insensitive);
# missing actors become the string "nan" and therefore never match
assoc_actor_text = df_women["assoc_actor_1"].astype(str)
mentions_women = assoc_actor_text.str.contains("Women", case=False, na=False)
df_filtered = df_women.loc[mentions_women].copy()

# Preview the first 5 matching rows
df_filtered.head()
Out[84]:
event_id_cnty event_date year time_precision disorder_type event_type sub_event_type actor1 assoc_actor_1 inter1 ... location latitude longitude geo_precision source source_scale notes fatalities tags timestamp
36 MAA2233 30 June 2025 2025 1 Demonstrations Protests Peaceful protest Protesters (Mauritania) Labor Group (Mauritania); Women (Mauritania) Protesters ... Nouadhibou 20.9434 -17.0380 1 Al Akhbar (Mauritania) National On 30 June 2025, a large number of women fish ... 0 crowd size=large 1751941686
50 MEX106699 30 June 2025 2025 1 Demonstrations Protests Peaceful protest Protesters (Mexico) Women (Mexico) Protesters ... Mazatlan 23.2003 -106.4222 1 Noroeste Subnational On 30 June 2025, in Mazatlan, Sinaloa, almost ... 0 crowd size=almost 100 1751941687
78 TUR45013 30 June 2025 2025 1 Demonstrations Protests Peaceful protest Protesters (Turkey) Lawyers (Turkey); Women (Turkey) Protesters ... Elazig 38.6743 39.2232 1 Evrensel National On 30 June 2025, women and lawyers gathered in... 0 crowd size=no report 1751941772
188 IND188094 30 June 2025 2025 1 Demonstrations Protests Peaceful protest Protesters (India) Sonowal Kachari Ethnic Group (India); Students... Protesters ... Dibrugarh 27.4727 94.9121 1 Pratidin Time; Times of India Subnational-National On 30 June 2025, All Assam Sonowal Kachari Stu... 0 crowd size=massive 1752014372
192 IND188187 30 June 2025 2025 1 Demonstrations Protests Peaceful protest Protesters (India) AAP: Aam Aadmi Party; Former Government of Ind... Protesters ... Delhi - Rajouri Garden 28.6331 77.1051 1 Amar Ujala; Deccan Chronicle National On 30 June 2025, AAP MLAs, including the AAP D... 0 crowd size=no report 1752014373

5 rows × 31 columns

In [86]:
# Run the fitted topic model on the event notes of the filtered rows.
# Note: this rebinds the notebook-level names `documents` and `topics`.
documents = df_filtered["notes"].astype(str).tolist()
topics, probs = topic_model.transform(documents)

# Store the assignment and its probability next to every event
df_filtered["topic"] = topics
df_filtered["probability"] = probs

# Topic id -> default topic name as reported by the model
topic_info = topic_model.get_topic_info()
raw_labels = topic_info.set_index("Topic")["Name"].to_dict()

# Strip everything up to the first underscore from each name; topic -1 and
# names without an underscore are reported as "Unassigned"
clean_labels = {}
for topic_id, label in raw_labels.items():
    if topic_id == -1 or "_" not in label:
        clean_labels[topic_id] = "Unassigned"
    else:
        clean_labels[topic_id] = label.split("_", 1)[1]

df_filtered["topic_label"] = df_filtered["topic"].map(clean_labels)
Batches:   0%|          | 0/16 [00:00<?, ?it/s]
2025-07-16 15:14:45,704 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-16 15:14:46,236 - BERTopic - Dimensionality - Completed ✓
2025-07-16 15:14:46,236 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-16 15:14:46,261 - BERTopic - Cluster - Completed ✓
In [108]:
# Build the frame this cell works on instead of relying on a variable left over
# from an earlier kernel session: `df_women_filtered` was previously used here
# before any cell defined it, so a fresh "Restart & Run All" raised NameError.
# Rows with topic -1 (unassigned) are excluded.
df_women_filtered = df_filtered[df_filtered["topic"] != -1].copy()

# Count how many times each topic appears in the filtered data
topic_counts = df_women_filtered["topic"].value_counts().to_dict()

# Add the count column to each row
df_women_filtered["count"] = df_women_filtered["topic"].map(topic_counts)
In [110]:
# Keep only the events that received a real topic (-1 means unassigned)
has_topic = df_filtered["topic"] != -1
df_women_filtered = df_filtered.loc[has_topic].copy()

# Attach to every row the number of events that share its topic
topic_counts = df_women_filtered["topic"].value_counts().to_dict()
df_women_filtered["count"] = df_women_filtered["topic"].map(topic_counts)

# Print 5 randomly sampled rows as a spot check
preview_columns = ["country", "topic", "topic_label", "count", "probability"]
preview = df_women_filtered[preview_columns].sample(5)
print(preview)
             country  topic                 topic_label  count  probability
5096           Yemen    165  houthi_enemy_zionist_stand      6     0.257061
10743  United States     19     gun_moms_control_action     12     0.897034
5074          Turkey    125  ihd_saturday_taksim_weekly      1     0.690430
3055   United States     29      trump_musk_elon_donald     64     0.576834
4673         Nigeria    135  akoko_community_fulani_lga      2     1.000000
In [116]:
# Write the classified events (selected columns only, no row index) to disk
export_columns = ["country", "topic", "topic_label", "count", "probability"]
df_women_filtered.loc[:, export_columns].to_csv("Output/Jun25Output.csv", index=False)